**********************************
*	Title: vp04.do
*	Date: 17 Sept 2006
*	Author: Zoe McLaren
*	Description: 	
*			1. Group observations by household and reshape data to have one line (observation) per household
*			2. Save new version to be merged with the individual record data
*
**********************************

*Load the weighted VP file (the clear option discards whatever dataset is currently in memory)
use "$stata/vpfwgt.dta", clear

*Identify observations with all variables missing
*checkvp = number of non-missing values across relat-curatsc and q1-q12,
*so checkvp==0 marks a record in which every one of those variables is missing.
*rownonmiss() is the documented name of the egen function formerly called robs().
egen checkvp = rownonmiss(relat-curatsc q1-q12)
label var checkvp "number of non-missing variables (to screen out blank obs)"
tab checkvp

*Create VP id numbers
*Records without a vpno get the placeholder 99999, so every such record within an eanumber
*ends up sharing one vpid. missing() is used instead of vpno==. so that extended missing
*codes (.a-.z) receive the placeholder too; otherwise egen group() would leave vpid missing.
replace vpno = 99999 if missing(vpno)
count if checkvp==0 & vpno==99999  /*blank records that also had no vpno*/
sort eanumber vpno age-curatsc q1-q12
egen vpid = group(eanumber vpno)  /*one id per distinct eanumber-vpno combination*/
codebook vpid
duplicates report eanumber vpno  /*shows how many records share each eanumber-vpno pair*/

*Create variable for number of members in household
*one = number of non-missing values among age-languag, recoded to missing when that number is 0,
*so a record with nothing recorded in age-languag is not counted as a household member.
*rownonmiss() is the documented name of the egen function formerly called robs().
egen one = rownonmiss(age-languag)
replace one = . if one==0
egen hhmems = count(one), by(vpid)  /*count() tallies the non-missing values of one within each vpid*/
label var hhmems "number of household members"

*Number the records of each household in order of descending age (memberid 1 = first record after the sort)
*NOTE(review): records tied on age within a vpid are not ordered by any further key here,
*so their relative memberid is not pinned down by this sort - confirm whether that matters downstream
gsort vpid -age  /*by vpid, then age descending within vpid*/
gen memberid = 1  /*provisional value; it is kept only on the first record of each vpid*/
replace memberid = memberid[_n-1] + 1 if vpid==vpid[_n-1]  /*later records continue the running count*/

*Remove empty records whose recno or id disagrees with the preceding record of the same household
*This relies on the current record order: each record is compared with the one directly above it
*temp = 1 when a record has the same vpid as the record above but a different recno or id
gen temp = 1 if vpid==vpid[_n-1] & (recno!=recno[_n-1] | id!=id[_n-1])
*temphh = number of flagged records within the household
egen temphh = count(temp), by(vpid)
*Author's notes on these data: all 15 records flagged within a vpid have checkvp==0 (they have no data)
*ZM thinks it's because respondents weren't available on 1st visit but completed the survey on a subsequent visit
drop if checkvp==0 & temp==1  /*15 obs dropped*/

*Collapse the data so that each household (vpid) occupies a single line

*Remove the variables that are not kept in the reshaped file
drop recno id hhstat persnum projno ma ea fvr houseno superno numhh numresp1 numresp2 reasonr
drop checkvp temp one temphh  /*checkvp is not constant in hh due to various missings*/

*Add the suffix _hh to these variables
foreach var of varlist interno eatype geotype weight real {
	rename `var' `var'_hh
}

*Each listed variable is spread into one column per memberid (sex1, sex2, ...), leaving one record per vpid
reshape wide sex age relat mstatus race languag evatsch hilev higrad curatsc, i(vpid) j(memberid)
order vpid hhmems eanumber- q12  /*move the id, household size and eanumber-q12 to the front*/

*The VP data should now be ready to merge with the individual data
sort eanumber vpno
label data "2004 VP data, reshaped to one line per hh, $S_DATE @ $S_TIME"
save "$data/vp04.dta", replace

exit
